library(dplyr)
library(readr)
library(ggplot2)
library(openxlsx)
library(knitr)
library(tibble)
library(stringr)
library(stringi)
library(readxl)
library(lubridate)
library(shiny)
library(plotly)
library(ruODK)

1 Loading the Data and Removal of Training Data

# Unzip and extract ODK data from ODK zip
# `export_load_from_odk()` is defined outside this section; it is called with
# the service settings in `params$svc` and its result becomes the working data
# frame `df` used by every later step.
# NOTE(review): presumably returns one row per logged form event -- confirm
# against the helper's definition.
df <- export_load_from_odk(params$svc)
## <ruODK settings>
##   Default ODK Central Project ID: 2 
##   Default ODK Central Form ID: 07-TIMCI-timeflow 
##   Default ODK Central URL: https://timicodktest.smartforest.de 
##   Default ODK Central Username: lucas.silbernagel@swisstph.ch 
##   Default ODK Central Password: run ruODK::get_default_pw() to show 
##   Default ODK Central Passphrase: run ruODK::get_default_pp() to show 
##   Default Time Zone: Europe/Berlin 
##   Default ODK Central Version: 1.1 
##   Default HTTP GET retries: 3 
##   Verbose messages: TRUE 
##   Test ODK Central Project ID:  
##   Test ODK Central Form ID:  
##   Test ODK Central Form ID (ZIP tests):  
##   Test ODK Central Form ID (Attachment tests):  
##   Test ODK Central Form ID (Parsing tests):  
##   Test ODK Central Form ID (WKT tests):  
##   Test ODK Central URL:  
##   Test ODK Central Username:  
##   Test ODK Central Password: run ruODK::get_test_pw() to show 
##   Test ODK Central Passphrase: run ruODK::get_test_pp() to show 
##   Test ODK Central Version: 1.1
# Convert the raw `start` / `end` values (integer milliseconds) into time
# stamps, one column at a time, using the project helper `format_date_ms()`.
df[["start"]] <- format_date_ms(df[["start"]])
df[["end"]] <- format_date_ms(df[["end"]])

# Show the first rows as a sanity check of the conversion.
head(df)
instance.ID event node start end latitude longitude accuracy old.value new.value
uuid:2985578d-e410-4a14-bd31-f6813536d5c8 form start NA 2021-01-23 20:50:14 NA NA NA NA NA NA
uuid:2985578d-e410-4a14-bd31-f6813536d5c8 group questions /data/front_page 2021-01-23 20:50:14 2021-01-23 20:50:18 NA NA NA NA NA
uuid:2985578d-e410-4a14-bd31-f6813536d5c8 group questions /data/visit_start 2021-01-23 20:50:18 2021-01-23 20:50:22 NA NA NA NA NA
uuid:2985578d-e410-4a14-bd31-f6813536d5c8 question /data/visit_start/b1_4 2021-01-23 20:50:18 2021-01-23 20:50:22 NA NA NA NA Joal
uuid:2985578d-e410-4a14-bd31-f6813536d5c8 question /data/steps[1]/step_type 2021-01-23 20:50:22 2021-01-23 20:50:27 NA NA NA NA 1
uuid:2985578d-e410-4a14-bd31-f6813536d5c8 question /data/steps[2]/step_type 2021-01-23 20:50:27 2021-01-23 20:50:31 NA NA NA NA 1
# Optional filter (currently disabled): keep only events that started after 18 July 2021
#df <- subset(df, as.Date(start) > as.Date("18.07.2021", "%d.%m.%Y"))

2 Deriving New Features

2.1 Time Spent per Event

# Time spent per event in whole seconds (end minus start).
# `difftime()` is called with explicit units: the plain `-` operator chooses
# its units automatically (secs, mins, hours or days) from the data, so
# `as.numeric()` on its result is not guaranteed to be in seconds, while all
# later steps (`seconds_to_period()`, division by 60) assume seconds.
df$time_spent <- round(
  as.numeric(difftime(df$end, df$start, units = "secs"))
)

2.2 Question

# Derive the bare question name from each node path by applying the project
# helper `create_question()` to every entry of `node`.
df[["question"]] <- sapply(X = df[["node"]], FUN = create_question)

2.3 Question Decoded

# Replace `df` with the result of the project helper `decode_question()`,
# passing the data frame, its `question` column and the service settings.
# NOTE(review): presumably this adds the `question_decoded` column used by the
# later aggregations -- confirm against the helper's definition.
df <- decode_question(df, df$question, params$svc)
## <ruODK settings>
##   Default ODK Central Project ID: 2 
##   Default ODK Central Form ID: 07-TIMCI-timeflow 
##   Default ODK Central URL: https://timicodktest.smartforest.de 
##   Default ODK Central Username: lucas.silbernagel@swisstph.ch 
##   Default ODK Central Password: run ruODK::get_default_pw() to show 
##   Default ODK Central Passphrase: run ruODK::get_default_pp() to show 
##   Default Time Zone: Europe/Berlin 
##   Default ODK Central Version: 1.1 
##   Default HTTP GET retries: 3 
##   Verbose messages: TRUE 
##   Test ODK Central Project ID:  
##   Test ODK Central Form ID:  
##   Test ODK Central Form ID (ZIP tests):  
##   Test ODK Central Form ID (Attachment tests):  
##   Test ODK Central Form ID (Parsing tests):  
##   Test ODK Central Form ID (WKT tests):  
##   Test ODK Central URL:  
##   Test ODK Central Username:  
##   Test ODK Central Password: run ruODK::get_test_pw() to show 
##   Test ODK Central Passphrase: run ruODK::get_test_pp() to show 
##   Test ODK Central Version: 1.1

2.4 Categorical Answers Decoded

# Replace `df` with the result of the project helper `decode_categories()`,
# called with the data frame and the service settings.
# NOTE(review): presumably this adds `new_value_decoded` / `old_value_decoded`
# -- confirm against the helper's definition.
df <- decode_categories(df, params$svc)
## <ruODK settings>
##   Default ODK Central Project ID: 2 
##   Default ODK Central Form ID: 07-TIMCI-timeflow 
##   Default ODK Central URL: https://timicodktest.smartforest.de 
##   Default ODK Central Username: lucas.silbernagel@swisstph.ch 
##   Default ODK Central Password: run ruODK::get_default_pw() to show 
##   Default ODK Central Passphrase: run ruODK::get_default_pp() to show 
##   Default Time Zone: Europe/Berlin 
##   Default ODK Central Version: 1.1 
##   Default HTTP GET retries: 3 
##   Verbose messages: TRUE 
##   Test ODK Central Project ID:  
##   Test ODK Central Form ID:  
##   Test ODK Central Form ID (ZIP tests):  
##   Test ODK Central Form ID (Attachment tests):  
##   Test ODK Central Form ID (Parsing tests):  
##   Test ODK Central Form ID (WKT tests):  
##   Test ODK Central URL:  
##   Test ODK Central Username:  
##   Test ODK Central Password: run ruODK::get_test_pw() to show 
##   Test ODK Central Passphrase: run ruODK::get_test_pp() to show 
##   Test ODK Central Version: 1.1

2.5 Time until a Response was Changed + Stream of Answer Changes

df <- df %>%
  # Order rows so that successive edits of the same question within one
  # instance sit next to each other.
  arrange(instance.ID, node, start) %>%
  # Placeholder columns for the derived features. `time_till_change` is typed
  # as numeric from the start so it does not change type on first assignment.
  add_column(time_till_change = NA_real_) %>%
  add_column(changed_from = NA)

# For every row whose old value equals the new value of the directly preceding
# row, store the number of seconds between the end of the preceding event and
# the start of this one.
#
# * The loop starts at row 2: row 1 has no predecessor, and index 0 would
#   select a zero-length vector instead of a value.
# * `seq_len()` makes the loop a no-op for data frames with fewer than two rows.
# * The NA checks run before the comparison so the condition is always a
#   proper TRUE/FALSE.
# * The preceding row only counts when it belongs to the same instance and
#   node; otherwise unrelated neighbouring questions could be matched by
#   coincidence.
# * Units are fixed to seconds; plain subtraction would pick minutes or hours
#   for longer gaps.
#
# NOTE(review): `changed_from` is created but never filled in this section.
for (i in seq_len(nrow(df))[-1]) {
  prev <- i - 1
  same_question <- isTRUE(df$instance.ID[i] == df$instance.ID[prev]) &&
    isTRUE(df$node[i] == df$node[prev])
  value_carried_over <- !is.na(df$old.value[i]) &&
    !is.na(df$new.value[prev]) &&
    df$old.value[i] == df$new.value[prev]
  if (same_question && value_carried_over) {
    df$time_till_change[i] <- round(
      as.numeric(difftime(df$start[i], df$end[prev], units = "secs"))
    )
  }
}

2.6 Preview and Summary of the Final Data

# Preview the first rows of the data frame including the derived columns.
head(df)
instance.ID event node start end latitude longitude accuracy old.value new.value time_spent question question_decoded new_value_decoded old_value_decoded time_till_change changed_from
uuid:0e0117d5-cc64-4d13-a614-3a16d5ad672a group questions /data/child_identification 2021-04-23 18:59:31 2021-04-23 19:00:00 NA NA NA NA NA 29 child_identification child_identification NA NA NA NA
uuid:0e0117d5-cc64-4d13-a614-3a16d5ad672a question /data/child_identification/a1_a_4a 2021-04-23 18:59:31 2021-04-23 19:00:00 NA NA NA NA S-F009-P0035 29 a1_a_4a If QR code scanning is not possible, please manually enter the participant identification code S-F009-P0035 NA NA NA
uuid:0e0117d5-cc64-4d13-a614-3a16d5ad672a group questions /data/front_page 2021-04-23 18:58:54 2021-04-23 18:58:55 NA NA NA NA NA 1 front_page front_page NA NA NA NA
uuid:0e0117d5-cc64-4d13-a614-3a16d5ad672a question /data/steps[1]/step_type 2021-04-23 18:59:03 2021-04-23 18:59:06 NA NA NA NA 3 3 step_type step_type triage NA NA NA
uuid:0e0117d5-cc64-4d13-a614-3a16d5ad672a question /data/steps[2]/step_type 2021-04-23 18:59:06 2021-04-23 18:59:09 NA NA NA NA 5 3 step_type step_type laboratory testing NA NA NA
uuid:0e0117d5-cc64-4d13-a614-3a16d5ad672a question /data/steps[3]/step_type 2021-04-23 18:59:09 2021-04-23 18:59:12 NA NA NA NA 2 2 step_type step_type waiting NA NA NA
# Per-column summary statistics of the final data frame.
summary(df)
##  instance.ID           event               node               start                    
##  Length:64          Length:64          Length:64          Min.   :2021-01-23 20:50:14  
##  Class :character   Class :character   Class :character   1st Qu.:2021-01-30 01:15:55  
##  Mode  :character   Mode  :character   Mode  :character   Median :2021-02-09 08:15:54  
##                                                           Mean   :2021-02-23 02:09:42  
##                                                           3rd Qu.:2021-04-23 18:58:54  
##                                                           Max.   :2021-04-23 19:00:03  
##                                                                                        
##       end                      latitude       longitude      accuracy         old.value   new.value        
##  Min.   :2021-01-23 20:50:18   Mode:logical   Mode:logical   Mode:logical   Min.   :1    Length:64         
##  1st Qu.:2021-01-30 01:16:16   NA's:64        NA's:64        NA's:64        1st Qu.:1    Class :character  
##  Median :2021-02-09 08:50:44                                                Median :1    Mode  :character  
##  Mean   :2021-02-23 18:49:15                                                Mean   :1                      
##  3rd Qu.:2021-04-23 18:58:57                                                3rd Qu.:1                      
##  Max.   :2021-04-23 19:00:03                                                Max.   :1                      
##  NA's   :16                                                                 NA's   :63                     
##    time_spent        question         question_decoded   new_value_decoded  old_value_decoded 
##  Min.   :   1.00   Length:64          Length:64          Length:64          Length:64         
##  1st Qu.:   3.75   Class :character   Class :character   Class :character   Class :character  
##  Median :   8.00   Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :  60.42                                                                              
##  3rd Qu.:  18.00                                                                              
##  Max.   :2073.00                                                                              
##  NA's   :16                                                                                   
##  time_till_change changed_from  
##  Mode:logical     Mode:logical  
##  NA's:64          NA's:64       
##                                 
##                                 
##                                 
##                                 
## 

3 General Information about the Data

# Headline figures for the report: number of distinct form instances, number
# of logged events, and the first / last calendar day covered by the data.
no_inst <- length(unique(df[["instance.ID"]]))
no_event <- nrow(df)
earliest_start <- as.Date(min(df[["start"]]))
# Missing end times are ignored when looking for the latest end.
latest_end <- as.Date(max(df[["end"]], na.rm = TRUE))

Total number of instances: 4
Total number of events/questions: 64
Examination period: 2021-01-23 - 2021-04-23

4 Grouped by Time

4.1 Events/Questions Started by Day

# Count how many events/questions were started on each calendar day.
df_by_day <- df %>%
  mutate(start_date = as.Date(start)) %>%
  count(start_date, name = "count")

# Daily counts as a line, overlaid with a LOESS trend line (no confidence
# band). The x-axis label previously read "Satrt Date".
gg1 <- ggplot(df_by_day, aes(x = start_date, y = count)) +
  geom_line() +
  geom_smooth(alpha = 0.5, colour = "red", method = "loess", se = FALSE) +
  labs(
    title = "Number of Events/Questions Started by Day with Smoothed Regression Line",
    y = "Number of Questions/Events Started",
    x = "Start Date"
  ) +
  theme_light()
gg1

4.2 Questions/Events started by Weekday and Hour of the Day

# Number of started events per (weekday, hour-of-day) cell; weeks start on
# Monday and weekdays are kept as labelled factors.
df_wday_hour <- df %>%
  mutate(
    wday = wday(start, label = TRUE, week_start = 1),
    hour = hour(start)
  ) %>%
  count(wday, hour, name = "count_wday_hour") %>%
  arrange(desc(wday))

# Minimal theme for the heatmap: no grid, border, ticks, x title or legend.
theme_heatmap <- theme_light() +
  theme(
    legend.position = "none",
    panel.grid = element_blank(),
    panel.border = element_blank(),
    axis.ticks = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 8),
    plot.title = element_text(face = "bold", size = 11, hjust = 0.5)
  )

# Heatmap: weekdays across the top, hours running downwards from 0 to 23,
# each tile annotated with its count.
gg2 <- ggplot(
  df_wday_hour,
  aes(x = wday, y = hour, fill = count_wday_hour)
) +
  geom_tile(colour = "white") +
  geom_text(aes(label = count_wday_hour), size = 2) +
  scale_fill_gradient(low = "#fff0f0", high = "#940606") +
  scale_x_discrete(expand = c(0, 0), position = "top") +
  scale_y_reverse(breaks = 23:0, labels = 23:0, expand = c(0, 0)) +
  labs(
    title = "Number of Started Events/Questions by Day of Week / Hour of Day",
    y = "Hour of Day"
  ) +
  theme_heatmap
gg2

4.3 Distribution of Time Spent per Event/Question with largest 5 % removed

# Keep only events whose time spent lies below the 95th percentile of all
# events; rows with a missing `time_spent` are dropped by `subset()`.
df_clean <- subset(
  df,
  time_spent < quantile(df$time_spent, probs = 0.95, na.rm = TRUE)
)

# Histogram of the remaining times, converted to minutes by dividing by 60.
hist(
  with(df_clean, time_spent[!is.na(time_spent)]) / 60,
  breaks = 20,
  main = "Histogram of the Time Spent by Question",
  xlab = "Time Spent in Minutes"
)

5 Aggregated by Event/Question

5.1 Median Time Spent by Question

# Median time spent per decoded question, longest first, shown as a period.
# Only rows with event == "question" are considered.
# `na.rm = TRUE`: `time_spent` is NA for events without an end time, and a
# single such row would otherwise turn the whole question's median into NA.
# Sorting happens on the numeric seconds, before the conversion to a period.
df_median_time_per_question <- df %>%
  filter(event == "question") %>%
  group_by(question_decoded) %>%
  summarise(median_time_spent = median(time_spent, na.rm = TRUE)) %>%
  arrange(desc(median_time_spent)) %>%
  mutate(median_time_spent = round(seconds_to_period(median_time_spent)))

df_median_time_per_question
question_decoded median_time_spent
If QR code scanning is not possible, please manually enter the participant identification code 52S
Please scan the participant’s QR code 21S
step_other 16S
Please select the current district 12S
step_type 8S
step_end 4S

5.2 Count of Input Changes and Median Time until Input was Changed by Question

# Per decoded question: how often an input was changed, plus the median and
# standard deviation of the time until the change. Only questions with more
# than one recorded change are reported, most frequently changed first.
df_changes_per_question <- df %>%
  filter(event == "question") %>%
  filter(!is.na(time_till_change)) %>%
  group_by(question_decoded) %>%
  summarise(
    count_input_changes = n(),
    median_time_till_change = median(time_till_change),
    sd_time_till_change = sd(time_till_change)
  ) %>%
  filter(count_input_changes > 1) %>%
  arrange(desc(count_input_changes)) %>%
  mutate(
    median_time_till_change = round(seconds_to_period(median_time_till_change)),
    sd_time_till_change = round(seconds_to_period(sd_time_till_change), 1)
  )

df_changes_per_question
question_decoded count_input_changes median_time_till_change sd_time_till_change

5.3 Count of Old-New Value Pairs

# Frequency of each (question, old value, new value) combination among the
# rows that have a recorded time till change; combinations seen only once are
# dropped and the rest is listed most frequent first.
df_stream <- df %>%
  filter(!is.na(time_till_change)) %>%
  count(
    question_decoded, old_value_decoded, new_value_decoded,
    name = "count_value_pairs"
  ) %>%
  arrange(desc(count_value_pairs)) %>%
  filter(count_value_pairs > 1)

df_stream
question_decoded old_value_decoded new_value_decoded count_value_pairs

6 Aggregated by Instance

6.1 Top 10 % of Duration by Instance

# Instances whose total duration (latest end minus earliest start) lies above
# the 90th percentile of all instance durations, longest first.
# The duration is computed explicitly in seconds: plain subtraction lets each
# group pick its own unit (secs, mins, hours, ...), which makes the values
# incomparable across instances and wrong as input to `seconds_to_period()`.
# Sorting is done on the numeric seconds, before the conversion to a period.
df_duration_per_inst <- df %>%
  group_by(instance.ID) %>%
  summarise(
    duration_per_inst = as.numeric(
      difftime(max(end, na.rm = TRUE), min(start, na.rm = TRUE), units = "secs")
    )
  ) %>%
  filter(duration_per_inst > quantile(duration_per_inst, 0.9, na.rm = TRUE)) %>%
  arrange(desc(duration_per_inst)) %>%
  mutate(duration_per_inst = round(seconds_to_period(duration_per_inst)))

df_duration_per_inst
instance.ID duration_per_inst
uuid:1fe2c870-db0a-4f66-86f4-8418f3a0372f 43S

6.2 Distribution of Duration by Instance with Top 10 % excluded

# Durations per instance (latest end minus earliest start) below the 90th
# percentile, i.e. with the longest instances excluded.
# The duration is computed explicitly in seconds so that the division by 60
# below really yields minutes; plain subtraction would choose its unit
# automatically and differently for each instance.
df_subsetted <- df %>%
  group_by(instance.ID) %>%
  summarise(
    duration_per_inst = as.numeric(
      difftime(max(end, na.rm = TRUE), min(start, na.rm = TRUE), units = "secs")
    )
  ) %>%
  filter(duration_per_inst < quantile(duration_per_inst, 0.9, na.rm = TRUE))

hist(
  df_subsetted$duration_per_inst / 60,
  breaks = 30,
  main = "Duration per Instance in Minutes (outliers removed)",
  xlab = "Duration in Minutes"
)

7 Irregularities and Outliers

7.1 Time Till Change Outliers (computed on all data; no outliers removed beforehand)

# Rows whose time till change exceeds the 90th percentile of all recorded
# times till change, longest first, reduced to the identifying columns and
# with the time shown as a rounded period.
df_time_till_change_outliers <- df %>%
  filter(
    time_till_change > quantile(df$time_till_change, probs = 0.9, na.rm = TRUE)
  ) %>%
  arrange(desc(time_till_change)) %>%
  select(
    instance.ID,
    question_decoded,
    old_value_decoded,
    new_value_decoded,
    time_till_change
  ) %>%
  mutate(time_till_change = round(seconds_to_period(time_till_change)))

df_time_till_change_outliers
instance.ID question_decoded old_value_decoded new_value_decoded time_till_change

7.2 Histograms of Instances with Inconsistent Filling Behaviour

# Flag instances with an uneven filling pattern: the start times of each
# instance are cut into 10 equal-width bins, and an instance is flagged when
# its events fall into fewer than 5 distinct bins.
irregular_inst <- c()
for (id in unique(df$instance.ID)) {
  bin_vec <- cut(
    df$start[df$instance.ID == id],
    breaks = 10,
    labels = FALSE
  )
  # Events spread over at least 5 bins: not flagged.
  if (length(unique(bin_vec)) >= 5) {
    next
  }
  irregular_inst <- c(irregular_inst, id)
}
paste0(length(irregular_inst), " out of ", length(unique(df$instance.ID)), " instances were found to have an inconsistent filling behaviour.")
## [1] "1 out of 4 instances were found to have an inconsistent filling behaviour."
# For every flagged instance: overlay a histogram of its events over the ten
# time bins, and collect the decoded questions that fall into the last bin.
last_bin_questions <- c()
fig <- plot_ly(alpha = 0.1)
for (id in irregular_inst) {
  temp_df <- df[df$instance.ID == id, ]
  temp_df$cut <- cut(
    temp_df$start,
    breaks = 10,
    labels = c(
      "1. Part", "2. Part", "3. Part", "4. Part", "5. Part",
      "6. Part", "7. Part", "8. Part", "9. Part", "10. Part"
    )
  )
  fig <- add_histogram(fig, x = temp_df$cut, name = id)

  last_bin_questions <- c(
    last_bin_questions,
    temp_df$question_decoded[temp_df$cut == "10. Part"]
  )
}
fig <- layout(fig, barmode = "overlay")
fig
# Frequency table of the questions seen in the last bin, most frequent first.
kable(arrange(as.data.frame(table(last_bin_questions)), desc(Freq)))
last_bin_questions Freq
child_identification 1
If QR code scanning is not possible, please manually enter the participant identification code 1
step_end 1
step_other 1
summary 1